import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
Buenas prácticas
Voy guardando las funciones ya automatizadas que creo que me servirán en otros proyectos en un archivo funciones_auxiliares.py, que después importo:
def plot_feature(df, col_name, isContinuous, target):
    """
    Draw a two-panel figure for one variable: its distribution and its
    relationship with the target.

    - df: dataframe holding both ``col_name`` and ``target``
    - col_name: name of the variable to plot
    - isContinuous: True draws a histogram plus a box plot per target value;
      False draws a count plot plus the per-category target proportions
    - target: name of the target column

    Returns nothing; the figure is left open on the pyplot state machine so
    the caller decides when to show or close it.
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 3), dpi=90)
    count_null = df[col_name].isnull().sum()

    # Left panel: distribution of the variable on its own.
    if isContinuous:
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(data=df, x=col_name, color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name + ' Numero de nulos: ' + str(count_null))
    # Rotate the labels of the left panel explicitly; plt.xticks would act on
    # the current axes, which is the right-hand panel after plt.subplots.
    ax1.tick_params(axis='x', rotation=90)

    # Right panel: the variable faceted by the target.
    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by ' + target)
    else:
        # observed=False keeps the current pandas behaviour for categorical
        # groupers and silences the FutureWarning about the default changing.
        data = (
            df.groupby(col_name, observed=False)[target]
            .value_counts(normalize=True)
            .to_frame('proportion')
            .reset_index()
        )
        # Use the function argument, not a loop variable from the caller.
        data.columns = [col_name, target, 'proportion']
        sns.barplot(x=col_name, y='proportion', hue=target, data=data,
                    saturation=1, ax=ax2)
        ax2.set_ylabel(target + ' fraction')
        ax2.set_title(target)
        ax2.tick_params(axis='x', rotation=90)
    ax2.set_xlabel(col_name)

    plt.tight_layout()
def dame_variables_categoricas(dataset=None):
    """
    Split the non-numeric columns of a dataframe by cardinality.

    A column is inspected only when its dtype compares unequal to both the
    builtin ``float`` and the builtin ``int``. Inspected columns with fewer
    than 100 distinct non-null values are reported as categorical; the other
    inspected columns are reported separately.

    Inputs:
        dataset: pandas dataframe holding the data.

    Returns:
        (lista_variables_categoricas, other): two lists of column names, the
        low-cardinality columns first and the high-cardinality ones second.
        The integer 1 is returned instead when no dataset is passed (legacy
        error signal kept for existing callers).
    """
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1

    lista_variables_categoricas = []
    other = []
    for col in dataset.columns:
        col_dtype = dataset[col].dtype
        # NOTE(review): the builtin ``int`` maps to NumPy's default integer
        # dtype, which may not be int64 on every platform/NumPy version, so
        # whether int64 columns are skipped here can vary - confirm.
        if col_dtype != float and col_dtype != int:
            # nunique ignores nulls and, unlike np.unique, does not need to
            # sort the values, so object columns with mixed types work.
            if dataset[col].nunique(dropna=True) < 100:
                lista_variables_categoricas.append(col)
            else:
                other.append(col)

    return lista_variables_categoricas, other
def get_corr_matrix(dataset=None, metodo='pearson', size_figure=(10, 8)):
    """
    Plot the correlation matrix of a dataframe as a heatmap.

    Inputs:
        dataset: pandas dataframe whose columns are correlated.
        metodo: correlation method forwarded to ``DataFrame.corr``
            ('pearson' by default; pass 'spearman' for Spearman correlation).
        size_figure: (width, height) of the figure, forwarded to
            ``plt.subplots`` as ``figsize``.

    Returns:
        0 after the heatmap has been shown, or 1 when no dataset is passed.
    """
    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1

    sns.set(style="white")

    corr = dataset.corr(method=metodo)
    # Zero the diagonal so the trivial self-correlation of 1 does not
    # dominate the colour scale.
    for pos in range(corr.shape[0]):
        corr.iloc[pos, pos] = 0

    # Draw on the axes created here instead of relying on pyplot's notion of
    # the current axes.
    f, ax = plt.subplots(figsize=size_figure)
    sns.heatmap(corr, center=0, square=True, linewidths=.5, cmap='viridis', ax=ax)
    plt.show()

    return 0
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    """
    Summarise, per variable, the values lying outside mean +/- multiplier * std.

    :param pd_loan: dataframe holding the variables and the target
    :param list_var_continuous: names of the columns to inspect
    :param target: name of the target column
    :param multiplier: number of standard deviations that defines the interval
    :return: dataframe with one row per variable that has at least one value
        outside the interval. Each row holds one column per target value
        (the share of outlying rows with that target value), plus 'variable',
        'sum_outlier_values' (count of outlying rows) and
        'porcentaje_sum_null_values' (outlying rows / total rows; the column
        name is kept as-is for existing callers). Empty when nothing
        lies outside the interval.
    """
    rows = []
    for col in list_var_continuous:
        values = pd_loan[col]
        series_mean = values.mean()
        std_amp = multiplier * values.std()
        left = series_mean - std_amp
        right = series_mean + std_amp

        # Build the mask once; comparisons with NaN are False, so nulls are
        # never counted as outliers.
        is_outlier = (values < left) | (values > right)
        n_outliers = int(is_outlier.sum())
        if n_outliers == 0:
            continue

        # A dict keyed by target value copes with any number of classes,
        # including the case where all outliers share a single class.
        row = pd_loan.loc[is_outlier, target].value_counts(normalize=True).to_dict()
        row['variable'] = col
        row['sum_outlier_values'] = n_outliers
        row['porcentaje_sum_null_values'] = n_outliers / values.size
        rows.append(row)

    pd_final = pd.DataFrame(rows)
    if pd_final.empty:
        print('No existen variables con valores atípicos')

    return pd_final
def get_percent_null_values_target(pd_loan, list_var_continuous, target):
    """
    Summarise, per variable, how its null values are distributed over the target.

    :param pd_loan: dataframe holding the variables and the target
    :param list_var_continuous: names of the columns to inspect
    :param target: name of the target column
    :return: dataframe with one row per variable that has at least one null.
        Each row holds one column per target value (the share of null rows
        with that target value), plus 'variable', 'sum_null_values' (count of
        null rows) and 'porcentaje_sum_null_values' (null rows / total rows).
        Empty when no inspected variable has nulls.
    """
    rows = []
    n_rows = pd_loan.shape[0]
    for col in list_var_continuous:
        is_null = pd_loan[col].isnull()
        n_nulls = int(is_null.sum())
        if n_nulls == 0:
            continue

        # A dict keyed by target value copes with any number of classes,
        # including the case where all null rows share a single class.
        row = pd_loan.loc[is_null, target].value_counts(normalize=True).to_dict()
        row['variable'] = col
        row['sum_null_values'] = n_nulls
        row['porcentaje_sum_null_values'] = n_nulls / n_rows
        rows.append(row)

    pd_final = pd.DataFrame(rows)
    if pd_final.empty:
        print('No existen variables con valores nulos')

    return pd_final
def cramers_v(confusion_matrix):
    """
    Calculate the bias-corrected Cramer's V statistic for
    categorical-categorical association.

    Uses the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    confusion_matrix: contingency table, e.g. the dataframe returned by
        pd.crosstab() or an equivalent 2-D array of counts.

    Returns the corrected statistic, or 0.0 when the table is too small for
    the correction to be defined (at most one observation, or a non-positive
    corrected dimension).
    """
    # Work on a plain array: DataFrame.sum() would return per-column sums
    # instead of the grand total needed below.
    observed = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(observed)[0]
    n = observed.sum()
    r, k = observed.shape
    if n <= 1:
        return 0.0

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # The corrected dimension collapses to zero for single-row/column tables
    # and when a dimension is as large as the sample; avoid dividing by it.
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)
Leo el dataset
# Load the preprocessed dataset and discard the 'Unnamed: 0' column that the
# CSV carries, then display its dimensions.
df_fraud = pd.read_csv("../data/pd_data_initial_preprocessing.csv")
df_fraud = df_fraud.drop(columns='Unnamed: 0')
df_fraud.shape
(1000000, 32)
# Display the column labels of the loaded dataset.
df_fraud.columns
Index(['intended_balcon_amount', 'prev_address_months_count',
'bank_months_count', 'current_address_months_count',
'session_length_in_minutes', 'fraud_bool', 'foreign_request',
'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit',
'device_os', 'source', 'housing_status', 'keep_alive_session',
'device_distinct_emails_8w', 'device_fraud_count', 'phone_home_valid',
'credit_risk_score', 'email_is_free', 'income', 'employment_status',
'date_of_birth_distinct_emails_4w', 'bank_branch_count_8w',
'velocity_4w', 'velocity_24h', 'velocity_6h', 'zip_count_4w',
'payment_type', 'days_since_request', 'customer_age',
'name_email_similarity', 'month'],
dtype='object')
# Columns flagged by dame_variables_categoricas become pandas categoricals.
list_var_cat, other = dame_variables_categoricas(dataset=df_fraud)
df_fraud = df_fraud.astype({name: "category" for name in list_var_cat})

# Every float column is handled as continuous and cast to float64.
list_var_continuous = df_fraud.select_dtypes('float').columns.tolist()
df_fraud = df_fraud.astype({name: float for name in list_var_continuous})

df_fraud.dtypes
intended_balcon_amount float64 prev_address_months_count float64 bank_months_count float64 current_address_months_count float64 session_length_in_minutes float64 fraud_bool category foreign_request float64 phone_mobile_valid float64 has_other_cards float64 proposed_credit_limit float64 device_os category source category housing_status category keep_alive_session float64 device_distinct_emails_8w float64 device_fraud_count float64 phone_home_valid float64 credit_risk_score float64 email_is_free float64 income float64 employment_status category date_of_birth_distinct_emails_4w float64 bank_branch_count_8w float64 velocity_4w float64 velocity_24h float64 velocity_6h float64 zip_count_4w float64 payment_type category days_since_request float64 customer_age float64 name_email_similarity float64 month float64 dtype: object
Este código prepara nuestro dataframe para el análisis exploratorio, asegurando que las variables categóricas y las variables continuas tengan el tipo de dato correcto.
# Share of each target value, expressed as a percentage.
df_fraud_bool = (
    df_fraud['fraud_bool']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)
# Absolute number of rows per target value.
df_fraud_bool_conteo = df_fraud['fraud_bool'].value_counts().reset_index()

# One table with both the percentage and the count, joined on the target value.
df_fraud_bool_pc = df_fraud_bool.merge(df_fraud_bool_conteo, on=['fraud_bool'], how='inner')
df_fraud_bool_pc

# Bar-style chart of the percentage per target value.
fig = px.histogram(df_fraud_bool_pc, x="fraud_bool", y=['percent'])
fig.update_xaxes(tickvals=[0, 1])
fig.show()
En este código creamos dos dataframes: el primero contiene el porcentaje de cada valor de 'fraud_bool' y el segundo contiene el conteo absoluto de cada valor. Luego se fusionan esos dataframes en uno solo utilizando la columna 'fraud_bool' como clave. Por último, realizamos un gráfico que nos muestra el porcentaje de cada clase, siendo 0 ausencia de fraude y 1 fraude.
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the target so both partitions keep the same
# fraud rate. The fixed random_state makes the split reproducible, so the
# statistics computed on the training partition are the same on every run.
X_df_fraud, X_df_fraud_test, y_df_fraud, y_df_fraud_test = train_test_split(
    df_fraud.drop(columns='fraud_bool'),
    df_fraud['fraud_bool'],
    stratify=df_fraud['fraud_bool'],
    test_size=0.2,
    random_state=42,
)

# Re-attach the target to each partition for the exploratory analysis.
df_fraud_train = pd.concat([X_df_fraud, y_df_fraud], axis=1)
df_fraud_test = pd.concat([X_df_fraud_test, y_df_fraud_test], axis=1)
Dividimos nuestro conjunto de datos en train y test, con una proporción del 80% y 20% respectivamente
# Check that the stratified split kept the target proportions in both parts.
for encabezado, particion in (('== Train\n', df_fraud_train), ('== Test\n', df_fraud_test)):
    print(encabezado, particion['fraud_bool'].value_counts(normalize=True))
== Train fraud_bool 0 0.988971 1 0.011029 Name: proportion, dtype: float64 == Test fraud_bool 0 0.98897 1 0.01103 Name: proportion, dtype: float64
# Null counts per column and per row of the training split, largest first.
pd_series_null_columns = df_fraud_train.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = df_fraud_train.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)

# to_frame names the single column directly instead of relying on how the
# DataFrame constructor matches an unnamed Series against `columns`.
pd_null_columnas = pd_series_null_columns.to_frame('nulos_columnas')
pd_null_filas = pd_series_null_rows.to_frame('nulos_filas')

# Take the target from the training split itself (aligned on the row index)
# so this summary never reads from rows outside the training data.
pd_null_filas['target'] = df_fraud_train['fraud_bool']

# Fractions: nulls per column over the number of rows, and nulls per row
# over the number of columns.
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas'] / df_fraud_train.shape[0]
pd_null_filas['porcentaje_filas'] = pd_null_filas['nulos_filas'] / df_fraud_train.shape[1]
(32,) (800000,)
Vemos el número de valores nulos por filas y por columnas
# Display the per-column null counts and fractions.
pd_null_columnas
| nulos_columnas | porcentaje_columnas | |
|---|---|---|
| intended_balcon_amount | 593804 | 0.742255 |
| prev_address_months_count | 570153 | 0.712691 |
| bank_months_count | 202916 | 0.253645 |
| current_address_months_count | 3442 | 0.004302 |
| session_length_in_minutes | 1598 | 0.001998 |
| velocity_6h | 0 | 0.000000 |
| date_of_birth_distinct_emails_4w | 0 | 0.000000 |
| bank_branch_count_8w | 0 | 0.000000 |
| velocity_4w | 0 | 0.000000 |
| velocity_24h | 0 | 0.000000 |
| payment_type | 0 | 0.000000 |
| zip_count_4w | 0 | 0.000000 |
| income | 0 | 0.000000 |
| days_since_request | 0 | 0.000000 |
| customer_age | 0 | 0.000000 |
| name_email_similarity | 0 | 0.000000 |
| month | 0 | 0.000000 |
| employment_status | 0 | 0.000000 |
| credit_risk_score | 0 | 0.000000 |
| email_is_free | 0 | 0.000000 |
| phone_home_valid | 0 | 0.000000 |
| device_fraud_count | 0 | 0.000000 |
| device_distinct_emails_8w | 0 | 0.000000 |
| keep_alive_session | 0 | 0.000000 |
| housing_status | 0 | 0.000000 |
| source | 0 | 0.000000 |
| device_os | 0 | 0.000000 |
| proposed_credit_limit | 0 | 0.000000 |
| has_other_cards | 0 | 0.000000 |
| phone_mobile_valid | 0 | 0.000000 |
| foreign_request | 0 | 0.000000 |
| fraud_bool | 0 | 0.000000 |
# Display the rows with the most null values.
pd_null_filas.head()
| nulos_filas | target | porcentaje_filas | |
|---|---|---|---|
| 483460 | 4 | 0 | 0.125 |
| 525137 | 4 | 0 | 0.125 |
| 448512 | 4 | 0 | 0.125 |
| 779819 | 4 | 0 | 0.125 |
| 69027 | 4 | 0 | 0.125 |
Distribución del resto de variables
# Plot every feature against the target. Float columns are drawn as
# continuous variables, everything else as categorical.
for i in df_fraud_train.columns:
    if i == 'fraud_bool':
        continue
    plot_feature(
        df_fraud_train,
        col_name=i,
        isContinuous=(df_fraud_train[i].dtype == float),
        target='fraud_bool',
    )
    # Render and release each figure before creating the next one; keeping
    # them all open triggers matplotlib's "More than 20 figures" warning.
    plt.show()
    plt.close()
C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. 
Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. 
Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:9: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. 
Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. 
Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\AppData\Local\Temp\ipykernel_24036\4282654840.py:28: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. 
C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. 
Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead C:\Users\jaime\anaconda3\envs\practica0\lib\site-packages\seaborn\categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
Se generan visualizaciones para las características del conjunto de datos, considerando si son de tipo float o categóricas, en relación con la variable objetivo 'fraud_bool'.
Por ejemplo, hay un gráfico de barras que compara la proporción de fraudes ('fraud_bool') en función del sistema operativo del dispositivo ('device_os'). Las barras representan el porcentaje de fraudes para cada sistema operativo, calculado sobre el total de observaciones en ese sistema. Se observa que las barras correspondientes al valor 0 son significativamente más altas en todos los sistemas operativos, indicando que la mayoría de las observaciones no son fraudes. Sin embargo, entre las barras del valor 1, las más altas se encuentran en los sistemas operativos 'Windows' y 'Macintosh', sugiriendo una posible asociación entre estos sistemas y un mayor porcentaje de fraudes.
Otro ejemplo es el gráfico de caja (boxplot) que compara la distribución del ingreso ('income') en función de la variable de fraude ('fraud_bool'). Para las observaciones sin fraude, el boxplot muestra que la mayoría de los ingresos se encuentran entre 0.3 y 0.8, con una mediana en 0.6. Por otro lado, para las observaciones con fraude el rango de ingresos es más pequeño, y la mayoría de los valores se encuentran entre 0.6 y 0.9, situándose la mediana en 0.8. Se observa un outlier en 0.1. Este boxplot proporciona información sobre la variabilidad en los ingresos para las categorías de fraude y no fraude.
A continuación, se tratan los valores missing, las correlaciones de las variables continuas y los outliers
# Display the names of the continuous features analysed in the cells below.
list_var_continuous
['intended_balcon_amount', 'prev_address_months_count', 'bank_months_count', 'current_address_months_count', 'session_length_in_minutes', 'foreign_request', 'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit', 'keep_alive_session', 'device_distinct_emails_8w', 'device_fraud_count', 'phone_home_valid', 'credit_risk_score', 'email_is_free', 'income', 'date_of_birth_distinct_emails_4w', 'bank_branch_count_8w', 'velocity_4w', 'velocity_24h', 'velocity_6h', 'zip_count_4w', 'days_since_request', 'customer_age', 'name_email_similarity', 'month']
# Outlier report for the continuous features of the training set, split by the
# 'fraud_bool' target. The helper is defined elsewhere; presumably multiplier=3
# flags values beyond mean +/- 3*std -- confirm against the helper.
get_deviation_of_mean_perc(df_fraud_train, list_var_continuous, target='fraud_bool', multiplier=3)
| 0.0 | 1.0 | variable | sum_outlier_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.989886 | 0.010114 | intended_balcon_amount | 1582 | 0.001978 |
| 1 | 0.993456 | 0.006544 | prev_address_months_count | 6877 | 0.008596 |
| 2 | 0.983702 | 0.016298 | current_address_months_count | 17119 | 0.021399 |
| 3 | 0.978821 | 0.021179 | session_length_in_minutes | 18839 | 0.023549 |
| 4 | 0.978306 | 0.021694 | foreign_request | 20328 | 0.025410 |
| 5 | 0.869619 | 0.130381 | proposed_credit_limit | 4878 | 0.006097 |
| 6 | 0.961725 | 0.038275 | device_distinct_emails_8w | 25500 | 0.031875 |
| 7 | 0.964863 | 0.035137 | credit_risk_score | 2846 | 0.003557 |
| 8 | 0.993676 | 0.006324 | date_of_birth_distinct_emails_4w | 5060 | 0.006325 |
| 9 | 0.989364 | 0.010636 | bank_branch_count_8w | 32720 | 0.040900 |
| 10 | 0.997706 | 0.002294 | velocity_24h | 436 | 0.000545 |
| 11 | 0.993050 | 0.006950 | velocity_6h | 3453 | 0.004316 |
| 12 | 0.990533 | 0.009467 | zip_count_4w | 12993 | 0.016241 |
| 13 | 0.987631 | 0.012369 | days_since_request | 14229 | 0.017786 |
| 14 | 0.956782 | 0.043218 | customer_age | 6340 | 0.007925 |
Los valores outlier se pueden sustituir por la media, mediana, valores extremos (media+3std o media-3std). Tras el siguiente análisis, hemos decidido como primera iteración dejarlos sin sustituir. Una vez llegue al modelo puedo realizar iteraciones utilizando diferentes métodos para comprobar si mejora el modelo
# Plot the Pearson correlation matrix of the continuous training features
# (helper defined elsewhere; size_figure is passed as [10, 8]).
get_corr_matrix(dataset = df_fraud_train[list_var_continuous],
metodo='pearson', size_figure=[10,8])
0
Este código genera y muestra una matriz de correlación entre variables continuas en el conjunto de datos. En el gráfico resultante, la mayoría de los puntos están cercanos a 0, indicando una baja correlación entre las variables. Destaca una correlación más notable entre 'credit_risk_score' y 'proposed_credit_limit'. Esto sugiere que un puntaje de riesgo crediticio más alto tiende a asociarse con un límite de crédito propuesto más alto, lo cual tiene sentido desde una perspectiva financiera.
# Pearson correlation between every pair of continuous training features.
corr = df_fraud_train[list_var_continuous].corr(method='pearson')

# Absolute correlations strictly below the main diagonal (k=-1); the diagonal
# and the upper triangle are zeroed so each pair is reported only once.
lower_abs = pd.DataFrame(
    np.tril(corr.abs(), k=-1),
    index=corr.index,
    columns=corr.columns,
)

# Long format: one row per (level_0, level_1) pair, strongest pairs first.
new_corr = lower_abs.stack().to_frame('correlation').reset_index()
new_corr = new_corr.sort_values(by='correlation', ascending=False)

# Show only the pairs whose absolute correlation exceeds 0.6.
new_corr.loc[new_corr['correlation'] > 0.6]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 643 | month | velocity_4w | 0.848145 |
| 334 | credit_risk_score | proposed_credit_limit | 0.605456 |
El código identifica y muestra los pares de variables continuas que tienen una correlación absoluta mayor a 0.6, lo que podría indicar una relación más fuerte entre esas variables.
¿Son todos los nulos de una clase de la variable objetivo, o tienen el mismo porcentaje de la variable objetivo?
# Display the continuous feature names again before the null-value analysis.
list_var_continuous
['intended_balcon_amount', 'prev_address_months_count', 'bank_months_count', 'current_address_months_count', 'session_length_in_minutes', 'foreign_request', 'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit', 'keep_alive_session', 'device_distinct_emails_8w', 'device_fraud_count', 'phone_home_valid', 'credit_risk_score', 'email_is_free', 'income', 'date_of_birth_distinct_emails_4w', 'bank_branch_count_8w', 'velocity_4w', 'velocity_24h', 'velocity_6h', 'zip_count_4w', 'days_since_request', 'customer_age', 'name_email_similarity', 'month']
# Null-value report for the continuous training features, broken down by the
# 'fraud_bool' target (helper defined elsewhere).
get_percent_null_values_target(df_fraud_train, list_var_continuous, target='fraud_bool')
| 0.0 | 1.0 | variable | sum_null_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.986863 | 0.013137 | intended_balcon_amount | 593804 | 0.742255 |
| 1 | 0.985802 | 0.014198 | prev_address_months_count | 570153 | 0.712691 |
| 2 | 0.983846 | 0.016154 | bank_months_count | 202916 | 0.253645 |
| 3 | 0.996223 | 0.003777 | current_address_months_count | 3442 | 0.004302 |
| 4 | 0.989362 | 0.010638 | session_length_in_minutes | 1598 | 0.001998 |
Algunos algoritmos aceptan en su input valores missing
Eliminar todas las filas que tengan valores nulos. En nuestro dataset no es lo más óptimo, debido a que hay bastantes filas a las que les ocurre esta situación.
Imputar los valores missing por:
media, mediana, máximo, mínimo o valores extremos (véase https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html)
Decido rellenar todas las columnas continuas menos session_length_in_minutes por el valor -99. De esta manera, diferencio los valores missing del resto de la muestra poniendo un valor muy separado del resto de la variable. Se puede explorar el resultado del modelo utilizando diferentes métodos.
# Every continuous feature except 'session_length_in_minutes', which keeps its
# NaNs at this point. The list is filtered in place of a set difference so the
# column order follows list_var_continuous and is identical on every run (set
# iteration order over strings changes between interpreter sessions);
# dict.fromkeys drops any duplicate name, as the set did.
list_vars = list(dict.fromkeys(
    col for col in list_var_continuous if col != 'session_length_in_minutes'
))

# Replace missing values with the sentinel -99, applied identically to train
# and test so both splits share the same encoding.
df_fraud_train[list_vars] = df_fraud_train[list_vars].fillna(-99)
df_fraud_test[list_vars] = df_fraud_test[list_vars].fillna(-99)

# Display the columns that were filled.
list_vars
['device_distinct_emails_8w', 'velocity_4w', 'prev_address_months_count', 'velocity_6h', 'email_is_free', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'foreign_request', 'credit_risk_score', 'velocity_24h', 'name_email_similarity', 'income', 'phone_mobile_valid', 'proposed_credit_limit', 'phone_home_valid', 'current_address_months_count', 'intended_balcon_amount', 'bank_months_count', 'zip_count_4w', 'customer_age', 'month', 'keep_alive_session', 'device_fraud_count', 'has_other_cards', 'days_since_request']
# Count the missing values of 'session_length_in_minutes' in the test set.
df_fraud_test['session_length_in_minutes'].isnull().sum()
417
# Null-value report for the continuous test features, broken down by the
# 'fraud_bool' target (helper defined elsewhere).
get_percent_null_values_target(df_fraud_test, list_var_continuous, target='fraud_bool')
| 0.0 | 1.0 | variable | sum_null_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.997602 | 0.002398 | session_length_in_minutes | 417 | 0.002085 |
https://scikit-learn.org/stable/modules/impute.html. Utilizar un modelo de regresión para rellenar los valores missing de alguna variable muy importante, por ejemplo: KNN, regresión lineal, xgboost. Pero, cuidado con el sobreajuste. Vamos a usar KNNImputer para imputar los valores missing de la variable session_length_in_minutes usando como regresoras todas las variables continuas.
# Feature columns fed to the imputer, computed once. dict.fromkeys removes
# duplicates while keeping the order of list_var_continuous, so the feature
# matrix and the resulting '*_input' columns come out in the same order on
# every run (a set of strings iterates in a session-dependent order).
knn_features = list(dict.fromkeys(list_var_continuous))
input_columns = [name + '_input' for name in knn_features]

X_train = df_fraud_train[knn_features]
X_test = df_fraud_test[knn_features]

# Fit on the training split only; the test split is transformed with the
# train-fitted imputer.
imputer = KNNImputer(n_neighbors=2, weights="uniform")
model = imputer.fit(X_train)

# transform() returns a plain array, so column names and the original row
# index are re-attached explicitly.
pd_input_train = pd.DataFrame(model.transform(X_train),
                              columns=input_columns,
                              index=df_fraud_train.index)
pd_input_test = pd.DataFrame(model.transform(X_test),
                             columns=input_columns,
                             index=df_fraud_test.index)

# Append the imputed '*_input' columns and drop the original continuous ones.
df_fraud_input_train = pd.concat([df_fraud_train, pd_input_train], axis=1).drop(knn_features, axis=1)
df_fraud_input_test = pd.concat([df_fraud_test, pd_input_test], axis=1).drop(knn_features, axis=1)
El código utiliza KNN para imputar los valores faltantes en las variables continuas del conjunto de datos, creando nuevos conjuntos de datos (df_fraud_input_train y df_fraud_input_test) con los valores imputados.
# Display the (rows, columns) shape of the imputed training frame.
df_fraud_input_train.shape
(800000, 32)
# Null-value report on the '*_input' columns of the imputed training frame
# (helper defined elsewhere).
get_percent_null_values_target(df_fraud_input_train, [i+'_input' for i in list_var_continuous], target='fraud_bool')
No existen variables con valores nulos
# Re-point list_var_continuous at the float columns of the imputed training
# frame, then plot their Pearson correlation matrix.
float_columns = df_fraud_input_train.select_dtypes(include='float').columns
list_var_continuous = float_columns.tolist()
get_corr_matrix(
    dataset=df_fraud_input_train[list_var_continuous],
    metodo='pearson',
    size_figure=[10, 8],
)
0
Este código selecciona las variables continuas imputadas en el train, y genera una matriz de correlación para explorar las relaciones lineales entre estas variables, como realizamos en uno de los pasos anteriores.
# Display the column names of the imputed training frame.
df_fraud_input_train.columns
Index(['device_os', 'source', 'housing_status', 'employment_status',
'payment_type', 'fraud_bool', 'prev_address_months_count_input',
'bank_branch_count_8w_input', 'credit_risk_score_input',
'velocity_24h_input', 'income_input',
'current_address_months_count_input', 'bank_months_count_input',
'zip_count_4w_input', 'customer_age_input', 'month_input',
'keep_alive_session_input', 'device_fraud_count_input',
'device_distinct_emails_8w_input', 'velocity_4w_input',
'velocity_6h_input', 'email_is_free_input',
'date_of_birth_distinct_emails_4w_input', 'foreign_request_input',
'name_email_similarity_input', 'phone_mobile_valid_input',
'proposed_credit_limit_input', 'phone_home_valid_input',
'intended_balcon_amount_input', 'session_length_in_minutes_input',
'has_other_cards_input', 'days_since_request_input'],
dtype='object')
Para la correlacion de spearman es necesario convertir las variables categoricas en numericas y luego obtener la correlación
# Display the names of the categorical columns (the list includes the target).
list_var_cat
['fraud_bool', 'device_os', 'source', 'housing_status', 'employment_status', 'payment_type']
# Contingency table of the target against 'device_os', printed and then passed
# to cramers_v (defined elsewhere) as a plain array.
target_values = df_fraud_input_train["fraud_bool"]
device_os_values = df_fraud_input_train["device_os"]
confusion_matrix = pd.crosstab(index=target_values, columns=device_os_values)
print(confusion_matrix)
cramers_v(confusion_matrix.to_numpy())
device_os linux macintosh other windows x11 fraud_bool 0 264784 42416 272503 205774 5700 1 1384 590 1551 5239 59
0.08110361070236907
Este código nos devuelve la matriz de confusión y el valor del coeficiente Cramér's V, que indica la fuerza de la asociación entre las variables categóricas "fraud_bool" y "device_os". Un valor cercano a 0 sugiere una asociación débil, mientras que un valor cercano a 1 indica una asociación más fuerte.
# Sanity check: contingency table of the target against itself, passed to
# cramers_v (defined elsewhere) as a plain array.
target_values = df_fraud_input_train["fraud_bool"]
confusion_matrix = pd.crosstab(index=target_values, columns=target_values)
cramers_v(confusion_matrix.to_numpy())
0.9999426978916621
En este paso, estamos comparando 'fraud_bool' consigo misma, por lo que el valor que nos da es aproximadamente 1, ya que es una asociación perfecta, aunque no es muy informativa.
# Contingency table of the target against 'source', passed to cramers_v
# (defined elsewhere) as a plain array.
target_values = df_fraud_input_train["fraud_bool"]
source_values = df_fraud_input_train["source"]
confusion_matrix = pd.crosstab(index=target_values, columns=source_values)
cramers_v(confusion_matrix.to_numpy())
0.0044690710571317705
En este código comparamos 'fraud_bool' con la variable 'source'
En las variables categoricas, los valores nulos se suelen sustituir por una nueva clase: "sin valor" o por la moda
# Apply the same categorical treatment to both splits: cast to object so the
# new label can be inserted, fill missing values with "SIN VALOR", then cast
# back to category.
for frame in (df_fraud_input_train, df_fraud_input_test):
    as_object = frame[list_var_cat].astype("object")
    frame[list_var_cat] = as_object.fillna("SIN VALOR").astype("category")

# Write the preprocessed splits to CSV.
df_fraud_input_train.to_csv("../data/train_pd_data_preprocessing_missing_outlier.csv")
df_fraud_input_test.to_csv("../data/test_pd_data_preprocessing_missing_outlier.csv")